Overall Scores¶

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

merged_scores = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

overall_scores = merged_scores.melt(
    id_vars=["CompletionCode"], 
    value_vars=["PreOverallScore", "PostOverallScore"], 
    var_name="Test Phase", 
    value_name="Score"
)

overall_scores["Test Phase"] = overall_scores["Test Phase"].replace(
    {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
)

overall_scores["Score"] = overall_scores["Score"].astype(float) * 100

pretest_mean = overall_scores[overall_scores["Test Phase"] == "Pre-Test"]["Score"].mean()
posttest_mean = overall_scores[overall_scores["Test Phase"] == "Post-Test"]["Score"].mean()

print(f"Pre-Test Mean: {pretest_mean:.2f}%")
print(f"Post-Test Mean: {posttest_mean:.2f}%")

plt.figure(figsize=(8, 6))
plt.boxplot([
    overall_scores[overall_scores["Test Phase"] == "Post-Test"]["Score"],
    overall_scores[overall_scores["Test Phase"] == "Pre-Test"]["Score"]
], vert=False, labels=["Post-Test", "Pre-Test"])

plt.title("Overall Correctness Scores: Pre-Test vs. Post-Test")
plt.xlabel("Correctness Score (%)")
plt.xlim(0, 100)
plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.show()
Pre-Test Mean: 64.56%
Post-Test Mean: 67.60%
No description has been provided for this image

Stats per Chart Type, Material, and School¶

In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

score_columns = [col for col in df.columns if "Score" in col]
df[score_columns] = df[score_columns] * 100

sns.set_style("whitegrid")

color_palette = {
    "LineChart": "blue",
    "AreaChart": "green",
    "StackedAreaChart": "orange",
    "Streamgraph": "red"
}

def create_boxplot_fixed_colors(data, x, y, hue=None, order=None, title="", xlabel="", ylabel=""):
    plt.figure(figsize=(10, 3.5))

    if hue == "Ordered Label":
        data["HueGroup"] = data[hue].str.extract(r"(LineChart|AreaChart|StackedAreaChart|Streamgraph)")
        ax = sns.boxplot(data=data, x=x, y=y, order=order, hue="HueGroup", palette=color_palette)
    else:
        ax = sns.boxplot(data=data, x=x, y=y, order=order, hue=hue)

    for median in ax.artists:
        median.set_edgecolor("black")
        median.set_linewidth(1.5)

    for line in ax.lines:
        if line.get_linestyle() == '-':  
            line.set_linewidth(2.5)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xlim(0, 100)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    if hue:
        plt.legend(title=hue, loc="upper left", fontsize="small")
    plt.show()

viz_types = ["LineChart", "AreaChart", "StackedAreaChart", "Streamgraph"]
mean_scores = {
    viz: {
        "Pre-Test Mean": df[f"Pre{viz}Score"].mean(),
        "Post-Test Mean": df[f"Post{viz}Score"].mean(),
    }
    for viz in viz_types
}

print("Mean Scores for Each Visualization Type:")
for viz, scores in mean_scores.items():
    print(f"{viz} Pre-Test: {scores['Pre-Test Mean']:.2f}% | {viz} Post-Test: {scores['Post-Test Mean']:.2f}%")

# Box Plot 1: Overall Pre-Test vs. Post-Test Scores
overall_scores = df.melt(
    id_vars=["CompletionCode"], 
    value_vars=["PreOverallScore", "PostOverallScore"], 
    var_name="Test Phase", 
    value_name="Score"
)
overall_scores["Test Phase"] = overall_scores["Test Phase"].replace(
    {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
)

create_boxplot_fixed_colors(
    overall_scores, "Score", "Test Phase",
    title="Overall Correctness Scores: Pre-Test vs. Post-Test",
    xlabel="Correctness Score (%)", ylabel="Test Phase"
)

# Box Plot 2: Pre-Test vs. Post-Test per Visualization Type
viz_scores = df.melt(
    id_vars=["CompletionCode"], 
    value_vars=[f"Pre{viz}Score" for viz in viz_types] + [f"Post{viz}Score" for viz in viz_types], 
    var_name="Test Phase", 
    value_name="Score"
)
viz_scores["Visualization Type"] = viz_scores["Test Phase"].str.extract(r"(LineChart|AreaChart|StackedAreaChart|Streamgraph)")
viz_scores["Test Phase"] = viz_scores["Test Phase"].str.replace(r"(Pre|Post)(.*)", r"\1-Test", regex=True)

viz_order = [
    "LineChart Pre-Test", "LineChart Post-Test",
    "AreaChart Pre-Test", "AreaChart Post-Test",
    "StackedAreaChart Pre-Test", "StackedAreaChart Post-Test",
    "Streamgraph Pre-Test", "Streamgraph Post-Test"
]
viz_scores["Ordered Label"] = viz_scores["Visualization Type"] + " " + viz_scores["Test Phase"]

create_boxplot_fixed_colors(
    viz_scores, "Score", "Ordered Label", hue="Ordered Label", order=viz_order,
    title="Pre-Test vs. Post-Test Scores per Visualization Type",
    xlabel="Correctness Score (%)", ylabel="Visualization Type & Test Phase"
)

# Box Plot 3: Pre-Test vs. Post-Test per Learning Material
group_scores = df.melt(
    id_vars=["CompletionCode", "Gruppe"], 
    value_vars=["PreOverallScore", "PostOverallScore"], 
    var_name="Test Phase", 
    value_name="Score"
)
group_scores["Test Phase"] = group_scores["Test Phase"].replace(
    {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
)

create_boxplot_fixed_colors(
    group_scores, "Score", "Gruppe", hue="Test Phase",
    title="Pre-Test vs. Post-Test Scores per Learning Material",
    xlabel="Correctness Score (%)", ylabel="Learning Material"
)

# Box Plot 4: Pre-Test vs. Post-Test per School (Test ID)
school_scores = df.melt(
    id_vars=["CompletionCode", "Test ID"], 
    value_vars=["PreOverallScore", "PostOverallScore"], 
    var_name="Test Phase", 
    value_name="Score"
)
school_scores["Test Phase"] = school_scores["Test Phase"].replace(
    {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
)

create_boxplot_fixed_colors(
    school_scores, "Score", "Test ID", hue="Test Phase",
    title="Pre-Test vs. Post-Test Scores per School",
    xlabel="Correctness Score (%)", ylabel="School"
)
Mean Scores for Each Visualization Type:
LineChart Pre-Test: 85.33% | LineChart Post-Test: 81.33%
AreaChart Pre-Test: 60.22% | AreaChart Post-Test: 60.44%
StackedAreaChart Pre-Test: 54.67% | StackedAreaChart Post-Test: 69.78%
Streamgraph Pre-Test: 49.44% | Streamgraph Post-Test: 68.33%
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Wilcoxon Signed Rank Test for Pre- and Post Test Scores per Chart Type¶

In [28]:
import pandas as pd
from scipy.stats import wilcoxon

df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

viz_types = ["LineChart", "AreaChart", "StackedAreaChart", "Streamgraph"]
wilcoxon_results = {}

for viz in viz_types:
    pre_scores = df[f"Pre{viz}Score"]
    post_scores = df[f"Post{viz}Score"]

    differences = post_scores - pre_scores
    nonzero_indices = differences != 0  

    stat, p = wilcoxon(pre_scores[nonzero_indices], post_scores[nonzero_indices], alternative='two-sided')

    if p < 0.01:
        interpretation = "Highly significant improvement" if post_scores.mean() > pre_scores.mean() else "Highly significant decline"
    elif p < 0.05:
        interpretation = "Significant improvement" if post_scores.mean() > pre_scores.mean() else "Significant decline"
    else:
        interpretation = "No significant change"

    wilcoxon_results[viz] = {"Test Statistic": stat, "p-value": p, "Interpretation": interpretation}

wilcoxon_df = pd.DataFrame.from_dict(wilcoxon_results, orient="index")

print("\nWilcoxon Signed-Rank Test Results:")
print(wilcoxon_df)
Wilcoxon Signed-Rank Test Results:
                  Test Statistic   p-value                  Interpretation
LineChart                  146.5  0.483352           No significant change
AreaChart                  295.5  0.986539           No significant change
StackedAreaChart           100.0  0.000060  Highly significant improvement
Streamgraph                 78.0  0.001816  Highly significant improvement

Kruskal-Wallis for Material vs Score¶

In [29]:
import pandas as pd
from scipy.stats import kruskal

df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

df["Improvement"] = df["PostOverallScore"] - df["PreOverallScore"]

improvement_groups = [group["Improvement"].values for _, group in df.groupby("Gruppe")]
kw_improvement = kruskal(*improvement_groups)

posttest_groups = [group["PostOverallScore"].values for _, group in df.groupby("Gruppe")]
kw_posttest = kruskal(*posttest_groups)

k = df["Gruppe"].nunique()
N = len(df)

eta_squared_improvement = (kw_improvement.statistic - (k - 1)) / (N - k)
eta_squared_posttest = (kw_posttest.statistic - (k - 1)) / (N - k)

print("Kruskal-Wallis Test on Improvement Scores:")
print(f"H-statistic = {kw_improvement.statistic:.5f}, p-value = {kw_improvement.pvalue:.5f}")
print(f"Effect Size (η²) = {eta_squared_improvement:.5f}\n")

print("Kruskal-Wallis Test on Post-Test Scores:")
print(f"H-statistic = {kw_posttest.statistic:.5f}, p-value = {kw_posttest.pvalue:.5f}")
print(f"Effect Size (η²) = {eta_squared_posttest:.5f}")
Kruskal-Wallis Test on Improvement Scores:
H-statistic = 5.71405, p-value = 0.12638
Effect Size (η²) = 0.06620

Kruskal-Wallis Test on Post-Test Scores:
H-statistic = 2.52040, p-value = 0.47162
Effect Size (η²) = -0.01170

Dunn's Post-Hoc Test¶

The Kruskal-Wallis test is not significant but there's a small-to-moderate effect size for the improvement scores -> Look which material might have contributed. Dunn's Test for Pairwise Comparisons using Holm-Bonferoni correction to control for multiple comparisons

In [30]:
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal

df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

df["Improvement"] = df["PostOverallScore"] - df["PreOverallScore"]

kw_improvement = kruskal(*[group["Improvement"].values for _, group in df.groupby("Gruppe")])

print(f"Kruskal-Wallis H-statistic = {kw_improvement.statistic:.5f}, p-value = {kw_improvement.pvalue:.5f}")

dunn_results = sp.posthoc_dunn(df, val_col="Improvement", group_col="Gruppe", p_adjust="holm")

print("\nDunn's Test (Pairwise Comparisons of Learning Materials):")
print(dunn_results)
Kruskal-Wallis H-statistic = 5.71405, p-value = 0.12638

Dunn's Test (Pairwise Comparisons of Learning Materials):
              Comic      Game  Schulbuch     Video
Comic      1.000000  1.000000   1.000000  0.136068
Game       1.000000  1.000000   1.000000  0.323652
Schulbuch  1.000000  1.000000   1.000000  0.489282
Video      0.136068  0.323652   0.489282  1.000000

Interpretation: No statistically significant difference. Comic performed the lowest and video the highest, hence the smallest p-value, but there's no real effect.

Individual Improvements of Students¶

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, kruskal

df_new = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

score_columns = ["PreOverallScore", "PostOverallScore"]
df_new[score_columns] = df_new[score_columns] * 100

df_new["Score Change"] = df_new["PostOverallScore"] - df_new["PreOverallScore"]

count_improved = (df_new["Score Change"] > 0).sum()
count_same = (df_new["Score Change"] == 0).sum()
count_declined = (df_new["Score Change"] < 0).sum()

print("Student Performance Summary:")
print(f"Improved: {count_improved}")
print(f"Stayed the Same: {count_same}")
print(f"Declined: {count_declined}\n")

df_new["Performance Group"] = pd.qcut(df_new["PreOverallScore"], q=3, labels=["Low", "Medium", "High"])

group_means_new = df_new.groupby("Performance Group", observed=True)["Score Change"].agg(["mean", "std", "count"])
print("Mean Improvement by Performance Group:")
print(group_means_new, "\n")

plt.figure(figsize=(8, 5))
sns.scatterplot(data=df_new, x="PreOverallScore", y="Score Change", alpha=0.6)
plt.axhline(0, color="red", linestyle="--")
plt.title("Scatterplot of Pre-Test Scores vs. Score Improvement")
plt.xlabel("Pre-Test Score (%)")
plt.ylabel("Score Improvement (%)")
plt.show()

plt.figure(figsize=(8, 5))
sns.boxplot(data=df_new, x="Score Change", y="Performance Group", order=["Low", "Medium", "High"])
plt.axvline(0, color="red", linestyle="--")
plt.title("Score Improvement by Pre-Test Performance Group")
plt.xlabel("Score Improvement (%)")
plt.ylabel("Pre-Test Performance Group")
plt.show()

groups_new = [df_new[df_new["Performance Group"] == g]["Score Change"] for g in df_new["Performance Group"].unique()]
kruskal_stat_new, kruskal_p_new = kruskal(*groups_new)

print("Kruskal-Wallis Test for Performance Group Differences:")
print(f"Statistic: {kruskal_stat_new:.4f}, p-value: {kruskal_p_new:.4f} {'(Significant)' if kruskal_p_new < 0.05 else '(Not Significant)'}\n")

spearman_corr_new, spearman_p_new = spearmanr(df_new["PreOverallScore"], df_new["Score Change"])
print("Spearman Correlation Between Pre-Test Score and Score Improvement:")
print(f"Spearman Correlation: {spearman_corr_new:.4f}, p-value: {spearman_p_new:.4f} {'(Significant)' if spearman_p_new < 0.05 else '(Not Significant)'}\n")
Student Performance Summary:
Improved: 25
Stayed the Same: 6
Declined: 14

Mean Improvement by Performance Group:
                       mean        std  count
Performance Group                            
Low                4.953560  19.646335     17
Medium             8.771930  12.685765     15
High              -6.072874  17.168729     13 

No description has been provided for this image
No description has been provided for this image
Kruskal-Wallis Test for Performance Group Differences:
Statistic: 5.3180, p-value: 0.0700 (Not Significant)

Spearman Correlation Between Pre-Test Score and Score Improvement:
Spearman Correlation: -0.2486, p-value: 0.0996 (Not Significant)

Interpretation: Kruskal-Wallis Test shows that differences in improvement of different performance groups are not statistically significant (p = 0.07). It's close to 0.05 though, so there might be a weak trend. Spearman Correlation (p = 0.099, r = -0.249) shows that students with lower pre-test scores might have improved more, but the trend is not statistically significant.

Gender and Age vs. Individual Improvement¶

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu, spearmanr

df_new = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=';', decimal=',')

score_columns = ["PreOverallScore", "PostOverallScore"]
df_new[score_columns] = df_new[score_columns] * 100

df_new["Score Change"] = df_new["PostOverallScore"] - df_new["PreOverallScore"]

df_new["Geschlecht"] = df_new["Geschlecht"].astype(str)
df_new["Alter"] = pd.to_numeric(df_new["Alter"], errors="coerce")

df_gender_filtered = df_new[df_new["Geschlecht"].isin(["Männlich", "Weiblich"])]

male_scores = df_gender_filtered[df_gender_filtered["Geschlecht"] == "Männlich"]["Score Change"].dropna()
female_scores = df_gender_filtered[df_gender_filtered["Geschlecht"] == "Weiblich"]["Score Change"].dropna()

male_mean = male_scores.mean()
male_std = male_scores.std()
female_mean = female_scores.mean()
female_std = female_scores.std()

print("Mean and Standard Deviation of Score Improvement by Gender:")
print(f"Männlich: Mean = {male_mean:.2f}, SD = {male_std:.2f}")
print(f"Weiblich: Mean = {female_mean:.2f}, SD = {female_std:.2f}\n")

mannwhitney_stat, mannwhitney_p = mannwhitneyu(male_scores, female_scores, alternative="two-sided")

spearman_age_corr, spearman_age_p = spearmanr(df_new["Alter"], df_new["Score Change"], nan_policy="omit")

plt.figure(figsize=(8, 2))
sns.boxplot(data=df_gender_filtered, x="Score Change", y="Geschlecht")
plt.axvline(0, color="red", linestyle="--")
plt.title("Score Improvement by Gender")
plt.xlabel("Score Improvement (%)")
plt.ylabel("Gender")
plt.show()

plt.figure(figsize=(8, 5))
sns.scatterplot(data=df_new, x="Alter", y="Score Change", alpha=0.6)
plt.axhline(0, color="red", linestyle="--")
plt.title("Scatterplot of Age vs. Score Improvement")
plt.xlabel("Age")
plt.ylabel("Score Improvement (%)")
plt.show()

print("Mann-Whitney U Test (Gender Differences in Score Improvement):")
print(f"Statistic: {mannwhitney_stat:.4f}, p-value: {mannwhitney_p:.4f} "
      f"{'(Significant)' if mannwhitney_p < 0.05 else '(Not Significant)'}\n")

print("Spearman Correlation (Age vs. Score Improvement):")
print(f"Correlation: {spearman_age_corr:.4f}, p-value: {spearman_age_p:.4f} "
      f"{'(Significant)' if spearman_age_p < 0.05 else '(Not Significant)'}\n")
Mean and Standard Deviation of Score Improvement by Gender:
Männlich: Mean = -3.16, SD = 17.59
Weiblich: Mean = 11.15, SD = 14.40

No description has been provided for this image
No description has been provided for this image
Mann-Whitney U Test (Gender Differences in Score Improvement):
Statistic: 102.0000, p-value: 0.0047 (Significant)

Spearman Correlation (Age vs. Score Improvement):
Correlation: -0.0840, p-value: 0.5834 (Not Significant)

Interpretation: Age doesn't matter, but gender makes a highly significant difference!

Demographics¶

In [34]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compute and print gender counts, mean, and standard deviation of age
gender_counts = df_gender_filtered["Geschlecht"].value_counts()
mean_age = df_new["Alter"].mean()
std_age = df_new["Alter"].std()

print("Gender Distribution:")
for gender, count in gender_counts.items():
    print(f"{gender}: {count} students")
print(f"\nMean Age: {mean_age:.2f} years")
print(f"Standard Deviation of Age: {std_age:.2f} years\n")

# Plot Age Distribution
plt.figure(figsize=(8, 2))
sns.histplot(df_new["Alter"].dropna(), bins=range(int(df_new["Alter"].min()), int(df_new["Alter"].max()) + 2), 
             kde=False, color="blue", discrete=True)
plt.xticks(range(int(df_new["Alter"].min()), int(df_new["Alter"].max()) + 1))
plt.title("Age Distribution of Students")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

# Prepare and plot Gender Distribution
gender_counts_df = gender_counts.reset_index()
gender_counts_df.columns = ["Geschlecht", "Count"]

plt.figure(figsize=(4, 2))
sns.barplot(data=gender_counts_df, x="Geschlecht", y="Count", hue="Geschlecht", palette=["blue", "pink"], legend=False)
plt.title("Gender Distribution")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.show()
Gender Distribution:
Männlich: 25 students
Weiblich: 17 students

Mean Age: 14.22 years
Standard Deviation of Age: 1.66 years

No description has been provided for this image
No description has been provided for this image
In [ ]: